library(here)

source(file.path(here(), "src","00_functions.R"))
source(file.path(here(), "src","00_wordfish_1.3_source.R"))

# Load the full transcript data; the code below expects this file to provide
# a data frame named `dat`.
load(file = "dat.file_AllTopics.Rdata")
dim(dat)

# Keep only the rows whose macro topic is crime news ("Cronaca").
dat <- dat[dat$MacroArgomento == "Cronaca", ]

# Give every remaining row a running label: "text1", "text2", ...
dat$text_quanteda <- paste0("text", seq_len(nrow(dat)))
dim(dat)


# Keep and rename the columns used in the rest of the script.
# (The text itself is cleaned by hand further below, because English-based
# pre-processing packages fail on Italian, e.g.: "l'amico" -> "lamico".)
names(dat)
library(dplyr)
dat <- dplyr::select(
  dat,
  # new name = old name; columns that keep their name are listed bare
  doc_id = Documento,                      # required
  text = Contenuto.Audio..Trascrizione.,   # required
  Argomento,
  Titolo,
  Edizione,
  date = Data.Trasmissione,
  canale = Canale.Di.Trasmissione,
  MacroArgomento,
  duration,
  Time.Start,
  Time.Stop,
  text_quanteda
)

# Load the previously created stopword list
# (presumably the `stopw` object used later in this script -- confirm).
load(file = "stopw.file.Rdata")

# Lower-case the transcripts
dat$text <- tolower(dat$text)

# Replace line feeds, tabs, carriage returns, backslashes and double quotes
# with a blank, one pattern after the other in this order.
dat$text <- Reduce(
  function(txt, pattern) gsub(pattern, " ", txt),
  c("\n", "\t", "\r", "\\\\", "\""),
  dat$text
)

# Replace punctuation with a blank instead of deleting it, so that words
# joined by an apostrophe are split apart rather than fused together.
dat$text <- gsub("[[:punct:]]", " ", dat$text)

# Name each entry: title, date and edition joined by "_".
# NOTE(review): `docs` is not among the columns kept by the select() call
# earlier in this script, so `dat$docs` is presumably NULL at this point --
# confirm whether it was meant to be part of the name.
dat$doc_name_long <- paste(dat$Titolo, dat$date, dat$Edizione, dat$docs, sep = "_")

# tm requires the first variable to be "doc_id" and the second "text"
#dat <- dat[c("doc_id","text","Argomento","docs","Titolo","Edizione","date","canale","MacroArgomento","duration","Time.Start","Time.Stop","doc_id")]

# Convert the text to UTF-8
dat$text <- iconv(dat$text, to = "UTF-8")

# Attach tm, store the identifier columns as plain character vectors and save
# the cleaned transcript-level data (stopwords are removed later, on the
# tm corpus).
library("tm")
dat[c("doc_id", "Argomento")] <- lapply(dat[c("doc_id", "Argomento")], as.character)
save(dat, file = "dat.cron.Rdata")

# Paste all transcripts from the same edition into a single document.
# The result has one row per distinct `doc_name_long`.
dat.edition <- aggregate(dat$text,
                         list(dat$doc_name_long),
                         paste,
                         collapse = " ")
names(dat.edition) <- c("id", "text")

# Edition-level metadata: keep the descriptive columns and drop duplicated
# rows, then attach them to the aggregated text via the edition name.
library(dplyr)
dat.tmp <- subset(dat,
                  select = c(Titolo, Edizione, date, canale, MacroArgomento, doc_name_long))
dat.tmp <- dat.tmp[!duplicated(dat.tmp), ]
dat.edition <- left_join(x = dat.edition,
                         y = dat.tmp,
                         by = c("id" = "doc_name_long"))
rm(dat.tmp)

# Delete editions with a missing channel.
# The rows are selected by the NA test itself rather than by a hard-coded
# position, so exactly the rows reported by which() are dropped (and none
# when there is no missing value).
which(is.na(dat.edition$canale))
dat.edition <- dat.edition[!is.na(dat.edition$canale), ]
save(dat.edition, file = "dat.edition.cron.Rdata")

library(tm)
# Build a tm corpus from the edition-level data frame with a custom reader.
# Approach taken from:
# https://cran.r-project.org/web/packages/tm/vignettes/extensions.pdf
# The mapping links each corpus field (left) to a column of `dat.edition`
# (right); the entry for the document text has to be called "content".
m <- list(
  content = "text",
  Titolo = "Titolo",
  Edizione = "Edizione",
  date = "date",
  canale = "canale",
  MacroArgomento = "MacroArgomento",
  id = "id"
)
myReader <- readTabular(mapping = m)
corpus.tm <- VCorpus(
  DataframeSource(dat.edition),
  readerControl = list(reader = myReader)
)
class(corpus.tm)

# Avoid calling some `quanteda` functions: the package is attached and then
# detached again, so it is not on the search path during the tm steps below.
library(quanteda)
tmp.pack <- "package:quanteda"
detach(tmp.pack, character.only = TRUE)

# Final cleaning:
#       1. numbers
#       2. stopwords removal *before* stemming
#       3. stemming in Italian, still not possible in quanteda
corpus.tm <- tm_map(corpus.tm, removeNumbers)
corpus.tm <- tm_map(corpus.tm, removeWords, stopw)
corpus.tm <- tm_map(corpus.tm, stemDocument, language = "italian")
save(corpus.tm, file = "corpus.tm.clean.Rdata")

# Free the intermediate objects. `dat.tmp` is not listed: it was already
# removed right after the metadata join, and rm() warns about objects that
# do not exist.
rm(dat, dat.edition, m, myReader)

# Re-attach quanteda and convert the cleaned tm corpus (re-read from the file
# saved above) into a quanteda corpus.
library(quanteda)
load("corpus.tm.clean.Rdata")
corpus.cron <- quanteda::corpus(corpus.tm)

# Document-feature matrix of the cleaned corpus
dfm.cron <- dfm(corpus.cron)
dim(dfm.cron)
# [1] 16516 56306

# Plain (dense) matrix copy used for the trimming steps that follow
dfm.mat.cron <- as.matrix(dfm.cron)

# Drop infrequent words:
# total number of occurrences of every word (column) across all documents
WordsUsage <- colSums(dfm.mat.cron)
#Bottom <- quantile(WordsUsage, probs = c(0.05))
length(which(WordsUsage < 20))

# Delete words in top percentiles
Top <- quantile(WordsUsage, probs = c(0.90))
length(which(WordsUsage > Top))
# [1] 2816      # top 5%
# [1] 5630      # top 10%
WordsToRemove <- which(WordsUsage < 20 | WordsUsage > Top)
length(WordsToRemove)
# [1] 19549
# [1] 31998  (top 10% & <20)
# Select the columns to keep with a logical mask. A negative index built from
# which() would drop *every* column when no word qualifies for removal
# (x[, -integer(0)] selects nothing); drop = FALSE keeps the result a matrix
# even if a single column is left.
dfm.mat.cron <- dfm.mat.cron[, !(WordsUsage < 20 | WordsUsage > Top), drop = FALSE]

# delete documents with fewer than 20 (remaining) words
WordsInDocs <- rowSums(dfm.mat.cron)
summary(WordsInDocs)
# Same reasoning as above: keep rows by logical mask, never by -which().
dfm.mat.cron <- dfm.mat.cron[WordsInDocs >= 20, , drop = FALSE]
dim(dfm.mat.cron)

save(dfm.mat.cron, file = "dfm.mat.cron.Rdata")

# Back to quanteda and estimate the Wordfish model on the trimmed matrix
library(quanteda)
dfm.cron <- as.dfm(dfm.mat.cron)
wordfish.out.cron <- textmodel_wordfish(dfm.cron)
save(wordfish.out.cron, file = "wordfish.out.cron.Rdata")


# Extract and plot results -----
load("wordfish.out.cron.Rdata")
load("dat.cron.Rdata")

library("quanteda")
# omega.cron <- coefficients(wordfish.out.cron)
sum.wordfish.cron <- summary(wordfish.out.cron)
sum.wordfish.cron$text <- rownames(sum.wordfish.cron)

# Estimated document positions together with the document names.
# The data frame is built column by column: cbind() would coerce the numeric
# positions to character first, and converting back afterwards loses
# precision.
tmp1 <- wordfish.out.cron@theta
tmp2 <- wordfish.out.cron@docs
tmp3 <- data.frame(theta = as.numeric(tmp1),
                   text_quanteda = as.character(tmp2),
                   stringsAsFactors = FALSE)

# Attach the positions to the transcript-level data, matching the model's
# document names against the edition name of each transcript.
dat$Time.Stop <- NULL
dat$Time.Start <- NULL
dat$text_quanteda <- dat$doc_name_long
dat <- dplyr::right_join(tmp3, dat, by = "text_quanteda")
dat$theta <- as.numeric(dat$theta)

# Name the object explicitly: save() called with only `file =` stores nothing.
save(dat, file = "dat.cron.wordfish.Rdata")

# Plotting: document positions over time, one colour per news programme.
load("dat.cron.wordfish.Rdata")
# ggplot2 is not attached anywhere in this script itself, so attach it here.
library(ggplot2)
dat.plot <- dat[, c("theta", "date", "Titolo")]
names(dat.plot) <- c("theta", "date", "TG")
# theme_bw() is a complete theme: added last it would reset the title and
# legend text settings, so it comes first and the theme() tweaks follow it.
# NOTE(review): the plot maps `colour`, not `fill`, so the guides(fill = ...)
# call presumably has no visible effect -- confirm the intended legend title.
ggplot(dat.plot, aes(x = date, y = theta, colour = TG)) +
        geom_point() +
        ggtitle("WORDFISH scaling on Crime news") +
        ylab("Omega scores") +
        stat_smooth(method = "loess", level = 0.99) +
        guides(fill = guide_legend(title = NULL)) +
        theme_bw() +
        theme(plot.title = element_text(size = 20, face = "bold")) +
        theme(legend.text = element_text(face = "bold", size = 16))
ggsave("scalingCrime.pdf")